Data Exploration - Bingo Aloha

Get packages:

## Warning: package 'tweedie' was built under R version 3.6.2
## Warning: package 'fitdistrplus' was built under R version 3.6.2
## Warning: package 'MASS' was built under R version 3.6.2
## Warning: package 'survival' was built under R version 3.6.2

Function Prep:

Matus Data:

# Load the Matus data set from a local CSV export.
# NOTE(review): absolute, machine-specific path — adjust before running
# elsewhere.
df_matus <- read.csv("/Users/PeterNovak/Desktop/ba_data.csv")

# Candidate distributions to fit and one colour per candidate (same order in
# both vectors). "tweedie" / "pink" are currently disabled.
distribution_name <- c(
  "lnorm", "weibull", "frechet", "loglogis", "burr", "gamma", "exp"
) # , "tweedie")
colours <- c(
  "purple", "darkgreen", "dodgerblue", "orange", "red", "brown", "cyan"
) # , "pink")

# Fit every candidate to the unscaled spend column. The three trailing flags
# are positional because get_dist() is defined outside this section.
fit_matus <- get_dist(
  df_matus$total_wins_spend,
  distribution_name, colours, TRUE, FALSE, FALSE
)
## Warning in fitdist(data = input_list, method = "mle", distr = "frechet", : The
## pfrechet function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "loglogis", : The
## ploglogis function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "burr", start =
## list(a = 1, : The pburr function should have its first argument named: q as in
## base R

# Show the result returned by get_dist() for the Matus data as a data frame.
as.data.frame(fit_matus)
# "Your rate": two-sample Kolmogorov-Smirnov test of the observed spend against
# 100000 draws from an exponential distribution with rate = 1.319.
# NOTE(review): the reference sample is simulated and no set.seed() call is
# visible here, so the statistic can differ slightly between runs; a one-sample
# test against "pexp" would avoid the simulation noise.
ks.test(df_matus$total_wins_spend, rexp(100000, rate = 1.319))
## Warning in ks.test(df_matus$total_wins_spend, rexp(1e+05, rate = 1.319)): p-
## value will be approximate in the presence of ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_matus$total_wins_spend and rexp(1e+05, rate = 1.319)
## D = 0.79521, p-value < 2.2e-16
## alternative hypothesis: two-sided
# "My best rate": two-sample Kolmogorov-Smirnov test of the observed spend
# against 100000 simulated draws from an exponential distribution with
# rate = 0.0289886.
ks.test(df_matus$total_wins_spend, rexp(100000, rate = 0.0289886))
## Warning in ks.test(df_matus$total_wins_spend, rexp(1e+05, rate = 0.0289886)): p-
## value will be approximate in the presence of ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_matus$total_wins_spend and rexp(1e+05, rate = 0.0289886)
## D = 0.31311, p-value < 2.2e-16
## alternative hypothesis: two-sided
# "My best distribution": two-sample Kolmogorov-Smirnov test of the observed
# spend against 100000 simulated draws from a log-normal distribution with
# meanlog = 2.210660 and sdlog = 1.516598.
ks.test(df_matus$total_wins_spend, rlnorm(100000, meanlog = 2.210660, sdlog = 1.516598))
## Warning in ks.test(df_matus$total_wins_spend, rlnorm(1e+05, meanlog = 2.21066, :
## p-value will be approximate in the presence of ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_matus$total_wins_spend and rlnorm(1e+05, meanlog = 2.21066, sdlog = 1.516598)
## D = 0.063347, p-value < 2.2e-16
## alternative hypothesis: two-sided

All Client Data:

# Load one data set per client with load_df() (defined outside this section).
df_bingo_aloha <- load_df("./data/data_bingo_aloha_30.csv")
df_homw <- load_df("./data/data_homw_30.csv")
df_idle_mafia <- load_df("./data/data_idle_mafia_30.csv")
df_spongebob <- load_df("./data/data_spongebob_30.csv")
df_terra_genesis <- load_df("./data/data_terra_genesis_30.csv")
df_ultimex <- load_df("./data/data_ultimex_30.csv")

# Value handed to the `boot` argument of every descdist() call below.
n_boots <- 500

# Describe the spend column of each client.
dist_bingo_aloha <- descdist(df_bingo_aloha$total_wins_spend, boot = n_boots)

dist_homw <- descdist(df_homw$total_wins_spend, boot = n_boots)

dist_idle_mafia <- descdist(df_idle_mafia$total_wins_spend, boot = n_boots)

dist_spongebob <- descdist(df_spongebob$total_wins_spend, boot = n_boots)

dist_terra_genesis <- descdist(df_terra_genesis$total_wins_spend, boot = n_boots)

dist_ultimex <- descdist(df_ultimex$total_wins_spend, boot = n_boots)

# Candidate distributions and one colour per candidate (same order in both
# vectors). "tweedie" / "pink" remain disabled.
distribution_name <- c(
  "lnorm", "weibull", "frechet", "loglogis", "burr", "gamma", "exp"
) # , "tweedie")
colours <- c(
  "purple", "darkgreen", "dodgerblue", "orange", "red", "brown", "cyan"
) # , "pink")

# Spend is divided by this factor before fitting; otherwise the exp fit does
# not work because e^x = Inf if x > 710.
# Note: as a variable this name masks base::scale.
scale <- 100

# Bingo Aloha: fit every candidate to spend / scale.
fit_bingo_aloha <- get_dist(
  df_bingo_aloha$total_wins_spend / scale,
  distribution_name, colours, TRUE, FALSE, FALSE
)
## Warning in fitdist(data = input_list, method = "mle", distr = "frechet", : The
## pfrechet function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "loglogis", : The
## ploglogis function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "burr", start =
## list(a = 1, : The pburr function should have its first argument named: q as in
## base R

# HOMW: fit every candidate distribution to spend / scale.
fit_homw <- get_dist(
  df_homw$total_wins_spend / scale,
  distribution_name, colours, TRUE, FALSE, FALSE
)
## Warning in fitdist(data = input_list, method = "mle", distr = "frechet", : The
## pfrechet function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "loglogis", : The
## ploglogis function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "burr", start =
## list(a = 1, : The pburr function should have its first argument named: q as in
## base R

# Idle Mafia: fit every candidate distribution to spend / scale.
fit_idle_mafia <- get_dist(
  df_idle_mafia$total_wins_spend / scale,
  distribution_name, colours, TRUE, FALSE, FALSE
)
## Warning in fitdist(data = input_list, method = "mle", distr = "frechet", : The
## pfrechet function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "loglogis", : The
## ploglogis function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "burr", start =
## list(a = 1, : The pburr function should have its first argument named: q as in
## base R

# Spongebob: fit every candidate distribution to spend / scale.
fit_spongebob <- get_dist(
  df_spongebob$total_wins_spend / scale,
  distribution_name, colours, TRUE, FALSE, FALSE
)
## Warning in fitdist(data = input_list, method = "mle", distr = "frechet", : The
## pfrechet function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "loglogis", : The
## ploglogis function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "burr", start =
## list(a = 1, : The pburr function should have its first argument named: q as in
## base R

# Terra Genesis: fit every candidate distribution to spend / scale.
fit_terra_genesis <- get_dist(
  df_terra_genesis$total_wins_spend / scale,
  distribution_name, colours, TRUE, FALSE, FALSE
)
## Warning in fitdist(data = input_list, method = "mle", distr = "frechet", : The
## pfrechet function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "loglogis", : The
## ploglogis function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "burr", start =
## list(a = 1, : The pburr function should have its first argument named: q as in
## base R

# Ultimate X-Poker: fit every candidate distribution to spend / scale.
fit_ultimex <- get_dist(
  df_ultimex$total_wins_spend / scale,
  distribution_name, colours, TRUE, FALSE, FALSE
)
## Warning in fitdist(data = input_list, method = "mle", distr = "frechet", : The
## pfrechet function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "loglogis", : The
## ploglogis function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "burr", start =
## list(a = 1, : The pburr function should have its first argument named: q as in
## base R

# Per-client fit results, in the same order as the row labels of mat_out2.
raw_fit <- list(
  fit_bingo_aloha, fit_homw, fit_idle_mafia,
  fit_spongebob, fit_terra_genesis, fit_ultimex
)

# Client x distribution table, pre-filled with NA.
mat_out2 <- matrix(
  data = NA,
  nrow = 6,
  ncol = length(distribution_name),
  dimnames = list(
    c(
      "Bingo Aloha",
      "HOMW",
      "Idle Mafia",
      "Spongebob",
      "Terra Genesis",
      "Ultimate X-Poker"
    ),
    distribution_name
  )
)

# Copy the first length(distribution_name) elements of each fit result into
# that client's row.
# NOTE(review): temp_gof[j] is single-index subsetting; if each fit result is a
# matrix this reads down its first column — confirm that is the intended
# metric.
for (i in seq_len(nrow(mat_out2))) {
  temp_gof <- raw_fit[[i]]
  for (j in seq_len(ncol(mat_out2))) {
    mat_out2[i, j] <- as.numeric(temp_gof[j])
  }
}

as.data.frame(mat_out2)
# Stack the per-client fit tables into one long table. For every client the
# first column holds the table's row names, the second column the client
# label, and the remaining columns are the fit table itself.
#
# The label is repeated nrow(fit_*) times so that it always matches the number
# of rows in the fit table. A hard-coded length of 4 does not match when the
# table has one row per fitted distribution, and makes cbind() recycle the
# label with a "number of rows of result is not a multiple of vector length"
# warning.
val_mat <- as.data.frame(rbind(
  cbind(rownames(fit_bingo_aloha),
        rep("Bingo Aloha", nrow(fit_bingo_aloha)),
        fit_bingo_aloha),
  cbind(rownames(fit_homw),
        rep("HOMW", nrow(fit_homw)),
        fit_homw),
  cbind(rownames(fit_idle_mafia),
        rep("Idle Mafia", nrow(fit_idle_mafia)),
        fit_idle_mafia),
  cbind(rownames(fit_spongebob),
        rep("Spongebob", nrow(fit_spongebob)),
        fit_spongebob),
  cbind(rownames(fit_terra_genesis),
        rep("Terra Genesis", nrow(fit_terra_genesis)),
        fit_terra_genesis),
  cbind(rownames(fit_ultimex),
        rep("Ultimate X-Poker", nrow(fit_ultimex)),
        fit_ultimex)
))
# Name the first two columns and reset the row names to a running index.
colnames(val_mat)[1:2] <- c("distribution", "client")
rownames(val_mat) <- seq_len(nrow(val_mat))

# Convert the metric columns to numeric. Going through character first means
# a factor column yields its labels rather than its integer codes.
metric_cols <- c("aic", "bic", "chsq", "ks")
val_mat[metric_cols] <- lapply(
  val_mat[metric_cols],
  function(column) as.numeric(as.character(column))
)

write.csv(val_mat, file = "./goodness_of_fit_per_client_raw.csv")

# One bar chart per goodness-of-fit metric: distributions on the x axis, one
# facet per client, laid out as 1 row x 6 columns.

# AIC
a <- ggplot(val_mat, aes(x = distribution, y = aic, fill = distribution)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ client, nrow = 1, ncol = 6)
# ggsave("./aic_plot_all.png", plot = a,
#   width = 36, height = 10, units = "cm", scale = 1.5
# )
a

# BIC
b <- ggplot(val_mat, aes(x = distribution, y = bic, fill = distribution)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ client, nrow = 1, ncol = 6)
# ggsave("./bic_plot_all.png", plot = b,
#   width = 36, height = 10, units = "cm", scale = 1.5
# )
b

# Chi-squared
# Note: as a variable, `c` masks base::c; calls such as c(...) still resolve
# to the function.
c <- ggplot(val_mat, aes(x = distribution, y = chsq, fill = distribution)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ client, nrow = 1, ncol = 6)
# ggsave("./chsq_plot_all.png", plot = c,
#   width = 36, height = 10, units = "cm", scale = 1.5
# )
c

# Kolmogorov-Smirnov
d <- ggplot(val_mat, aes(x = distribution, y = ks, fill = distribution)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ client, nrow = 1, ncol = 6)
# ggsave("./ks_plot_all.png", plot = d,
#   width = 36, height = 10, units = "cm", scale = 1.5
# )
d

Same analysis, but without rescaling the data (scale = 1) - so the exponential distribution (and gamma) is left out as well:

All five of the distributions used below are right-skewed (most of the mass on the left, with a long right tail) and have similar properties, so the results are bound to be similar.

# Reduced candidate set for the unscaled run, with one colour per candidate
# (same order in both vectors).
distribution_name <- c(
  "lnorm", "weibull", "frechet", "loglogis", "burr"
)
colours <- c(
  "purple", "darkgreen", "dodgerblue", "orange", "red"
)

# A factor of 1 leaves the spend values unchanged.
scale <- 1

# Bingo Aloha: fit every candidate to spend / scale.
fit_bingo_aloha <- get_dist(
  df_bingo_aloha$total_wins_spend / scale,
  distribution_name, colours, TRUE, FALSE, FALSE
)
## Warning in fitdist(data = input_list, method = "mle", distr = "frechet", : The
## pfrechet function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "loglogis", : The
## ploglogis function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "burr", start =
## list(a = 1, : The pburr function should have its first argument named: q as in
## base R

# HOMW: fit every candidate distribution to spend / scale.
fit_homw <- get_dist(
  df_homw$total_wins_spend / scale,
  distribution_name, colours, TRUE, FALSE, FALSE
)
## Warning in fitdist(data = input_list, method = "mle", distr = "frechet", : The
## pfrechet function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "loglogis", : The
## ploglogis function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "burr", start =
## list(a = 1, : The pburr function should have its first argument named: q as in
## base R

# Idle Mafia: fit every candidate distribution to spend / scale.
fit_idle_mafia <- get_dist(
  df_idle_mafia$total_wins_spend / scale,
  distribution_name, colours, TRUE, FALSE, FALSE
)
## Warning in fitdist(data = input_list, method = "mle", distr = "frechet", : The
## pfrechet function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "loglogis", : The
## ploglogis function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "burr", start =
## list(a = 1, : The pburr function should have its first argument named: q as in
## base R

# Spongebob: fit every candidate distribution to spend / scale.
fit_spongebob <- get_dist(
  df_spongebob$total_wins_spend / scale,
  distribution_name, colours, TRUE, FALSE, FALSE
)
## Warning in fitdist(data = input_list, method = "mle", distr = "frechet", : The
## pfrechet function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "loglogis", : The
## ploglogis function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "burr", start =
## list(a = 1, : The pburr function should have its first argument named: q as in
## base R

# Terra Genesis: fit every candidate distribution to spend / scale.
fit_terra_genesis <- get_dist(
  df_terra_genesis$total_wins_spend / scale,
  distribution_name, colours, TRUE, FALSE, FALSE
)
## Warning in fitdist(data = input_list, method = "mle", distr = "frechet", : The
## pfrechet function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "loglogis", : The
## ploglogis function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "burr", start =
## list(a = 1, : The pburr function should have its first argument named: q as in
## base R

# Ultimate X-Poker: fit every candidate distribution to spend / scale.
fit_ultimex <- get_dist(
  df_ultimex$total_wins_spend / scale,
  distribution_name, colours, TRUE, FALSE, FALSE
)
## Warning in fitdist(data = input_list, method = "mle", distr = "frechet", : The
## pfrechet function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "loglogis", : The
## ploglogis function should have its first argument named: q as in base R
## Warning in fitdist(data = input_list, method = "mle", distr = "burr", start =
## list(a = 1, : The pburr function should have its first argument named: q as in
## base R

# Per-client fit results, in the same order as the row labels of mat_out2.
raw_fit <- list(
  fit_bingo_aloha, fit_homw, fit_idle_mafia,
  fit_spongebob, fit_terra_genesis, fit_ultimex
)

# Client x distribution table, pre-filled with NA.
mat_out2 <- matrix(
  data = NA,
  nrow = 6,
  ncol = length(distribution_name),
  dimnames = list(
    c(
      "Bingo Aloha",
      "HOMW",
      "Idle Mafia",
      "Spongebob",
      "Terra Genesis",
      "Ultimate X-Poker"
    ),
    distribution_name
  )
)

# Copy the first length(distribution_name) elements of each fit result into
# that client's row.
# NOTE(review): temp_gof[j] is single-index subsetting; if each fit result is a
# matrix this reads down its first column — confirm that is the intended
# metric.
for (i in seq_len(nrow(mat_out2))) {
  temp_gof <- raw_fit[[i]]
  for (j in seq_len(ncol(mat_out2))) {
    mat_out2[i, j] <- as.numeric(temp_gof[j])
  }
}

as.data.frame(mat_out2)
# Stack the per-client fit tables into one long table. For every client the
# first column holds the table's row names, the second column the client label
# (repeated once per row of the fit table), and the remaining columns are the
# fit table itself.
val_mat <- as.data.frame(
  rbind(
    cbind(
      rownames(fit_bingo_aloha),
      rep("Bingo Aloha", nrow(fit_bingo_aloha)),
      fit_bingo_aloha
    ),
    cbind(
      rownames(fit_homw),
      rep("HOMW", nrow(fit_homw)),
      fit_homw
    ),
    cbind(
      rownames(fit_idle_mafia),
      rep("Idle Mafia", nrow(fit_idle_mafia)),
      fit_idle_mafia
    ),
    cbind(
      rownames(fit_spongebob),
      rep("Spongebob", nrow(fit_spongebob)),
      fit_spongebob
    ),
    cbind(
      rownames(fit_terra_genesis),
      rep("Terra Genesis", nrow(fit_terra_genesis)),
      fit_terra_genesis
    ),
    cbind(
      rownames(fit_ultimex),
      rep("Ultimate X-Poker", nrow(fit_ultimex)),
      fit_ultimex
    )
  )
)
# Name the first two columns and reset the row names to a running index.
colnames(val_mat)[1:2] <- c("distribution", "client")
rownames(val_mat) <- seq_len(nrow(val_mat))

# Convert the metric columns to numeric. Going through character first means
# a factor column yields its labels rather than its integer codes.
metric_cols <- c("aic", "bic", "chsq", "ks")
val_mat[metric_cols] <- lapply(
  val_mat[metric_cols],
  function(column) as.numeric(as.character(column))
)

# write.csv(val_mat, "./goodness_of_fit_per_client_raw.csv")

# One bar chart per goodness-of-fit metric: distributions on the x axis, one
# facet per client, laid out as 1 row x 6 columns.

# AIC
a <- ggplot(val_mat, aes(x = distribution, y = aic, fill = distribution)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ client, nrow = 1, ncol = 6)
# ggsave("./aic_plot_all.png", plot = a,
#   width = 36, height = 10, units = "cm", scale = 1.5
# )
a

# BIC
b <- ggplot(val_mat, aes(x = distribution, y = bic, fill = distribution)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ client, nrow = 1, ncol = 6)
# ggsave("./bic_plot_all.png", plot = b,
#   width = 36, height = 10, units = "cm", scale = 1.5
# )
b

# Chi-squared
# Note: as a variable, `c` masks base::c; calls such as c(...) still resolve
# to the function.
c <- ggplot(val_mat, aes(x = distribution, y = chsq, fill = distribution)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ client, nrow = 1, ncol = 6)
# ggsave("./chsq_plot_all.png", plot = c,
#   width = 36, height = 10, units = "cm", scale = 1.5
# )
c

# Kolmogorov-Smirnov
d <- ggplot(val_mat, aes(x = distribution, y = ks, fill = distribution)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ client, nrow = 1, ncol = 6)
# ggsave("./ks_plot_all.png", plot = d,
#   width = 36, height = 10, units = "cm", scale = 1.5
# )
d